rm(list = ls())

# Note that this simulation takes about 1 hour and 40 minutes to run
# because it does 10,000 randomization inference tests,
# each with 1,000 permutations

wd <- ".../Replication/"
setwd(wd)

# Load/install packages --
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  foreign, 
  ggplot2, 
  estimatr,
  texreg,
  xtable,
  fastDummies,
  sandwich,
  dplyr,
  janitor, 
  gridExtra,
  gsheet,
  zoo,
  interflex,
  lubridate,
  tidyverse,
  stringi,
  readxl,
  ri2,
  modelsummary,
  ggpubr
)

options(scipen=999)

# Note: given the random component inherent to the randomization tests performed
# by ri(), you may observe small differences in the p-values of the randomization
# tests performed using that function.

# Load data ----
d <- read.csv("Data/baseline database.csv")
winners <- subset(d, d$group=="Liberado sorteo")
censo <- read.csv("Data/census_nonwhites.csv")

# remove from the census individuals with possible matches in treated group
censo$name1 = paste(censo$nombre,censo$apellido)
censo$name2 = paste(censo$nombre,censo$apellido_jefehogar)
censo$name_search = ifelse(is.na(censo$apellido)==TRUE,censo$name2,censo$name1)
censo$name_search = ifelse(is.na(censo$apellido)==TRUE & is.na(censo$apellido_jefehogar)==TRUE,NA,censo$name_search)
censo = subset(censo, is.na(censo$name_search)==FALSE)

possiblematches = NA

for(i in 1:nrow(winners)){
  list1 = agrep(winners$full_name[i], censo$name_search, max.distance =0.10, value = TRUE)
  possiblematches = c(possiblematches,list1)
}

censo2 <- subset(censo, !(censo$name_search %in% possiblematches))
enslaved <- subset(censo2, censo2$slave==1)
enslaved$lottery_winner <- 0
enslaved$imprisoned_dummy <- enslaved$imprisoned>0

win <- cbind.data.frame(winners$full_name,
                        winners$imprisoned_dummy,
                        winners$lottery_winner)


# create subsets of the census
all <- cbind.data.frame(enslaved$nombre,
                        enslaved$imprisoned_dummy,
                        enslaved$lottery_winner)


colnames(win) <- c("name","imprisoned_dummy","lottery_winner")
colnames(all) <- c("name","imprisoned_dummy","lottery_winner")

# declare RI design
declare.d1 <- declare_ra(N = 688, m=45)

# simulation
coef <- NA
exact.p <- NA

starttime = Sys.time()
set.seed(123)

for(i in 1:10000){
  
  # create synthetic control groups
  synth <- all[sample(x=1:nrow(all),size=643,replace=FALSE),]
  d2 <- rbind.data.frame(win,synth)
  
  # conduct randomization inference
  ri <- conduct_ri(
    formula = imprisoned_dummy ~ lottery_winner,
    declaration = declare.d1,
    sharp_hypothesis = 0,
    assignment = "lottery_winner",
    data = d2,
    sims = 1000
  )
  
  # save data
  coef[i] <- as.numeric(unlist(summary(ri)[2]))
  exact.p[i] <-  as.numeric(unlist(summary(ri)[3]))
  
  # print progress
  print(i)
  
}

endtime = Sys.time()

# save plot
pdf("Output/Figure3.pdf",height=5, width=10)
par(mfrow=c(1,2))
hist(coef, breaks = 10, main = "Distribution of difference in means", xlab = "")
hist(exact.p, breaks = 100, main = "Distribution of exact p values", xlab = "")
abline(v = 0.1, col = "red", lty = 2)
dev.off()

# descriptive stats
summary(coef)
mean(exact.p<0.1)
